Today we give the AI assistant the ability to process documents and build a knowledge base! By combining document parsing with semantic search, the assistant will be able to understand, analyze, and answer questions about document content, becoming a true knowledge-management expert.
Modern work involves huge volumes of documents: PDF reports, Word documents, Excel spreadsheets, plain-text files, and more. An AI assistant that can read, search, and answer questions about these files becomes far more valuable. Here is the project layout we will build:
document_knowledge_system/
├── main.py                      # Main program
├── document_processor/
│   ├── __init__.py
│   ├── pdf_processor.py         # PDF processor
│   ├── word_processor.py        # Word processor
│   └── text_processor.py        # Plain-text processor
├── knowledge_base/
│   ├── __init__.py
│   ├── vector_store.py          # Vector store
│   └── semantic_search.py       # Semantic search
├── workflows/
│   ├── __init__.py
│   └── document_workflow.py     # LangGraph document workflow
└── utils/
    ├── __init__.py
    └── text_chunker.py          # Text chunking utility
pip install PyPDF2
pip install -U sentence-transformers
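The workflow and Q&A code later in this post also imports langgraph and google-generativeai. If they are not already installed from earlier days of this series, something like the following should cover them:
pip install langgraph
pip install google-generativeai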
import PyPDF2
from typing import Any, Dict, List
class PDFProcessor:
    """PDF document processor."""
    def extract_text(self, file_path: str) -> Dict[str, Any]:
        """Extract the text content of a PDF file."""
try:
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
text_content = []
for page_num, page in enumerate(pdf_reader.pages):
page_text = page.extract_text()
if page_text.strip():
text_content.append({
'page': page_num + 1,
'content': page_text.strip()
})
return {
'success': True,
'total_pages': len(pdf_reader.pages),
'content': text_content,
'full_text': '\n'.join([item['content'] for item in text_content])
}
except Exception as e:
return {'success': False, 'error': str(e)}
    def extract_metadata(self, file_path: str) -> Dict:
        """Extract PDF metadata."""
try:
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
metadata = pdf_reader.metadata
return {
'title': metadata.get('/Title', ''),
'author': metadata.get('/Author', ''),
'creator': metadata.get('/Creator', ''),
'creation_date': str(metadata.get('/CreationDate', '')),
'pages': len(pdf_reader.pages)
}
        except Exception:
return {}
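It can help to try the processor on its own before wiring it into the workflow. A minimal sketch, assuming a local file such as report.pdf (a placeholder path):
from document_processor.pdf_processor import PDFProcessor

processor = PDFProcessor()
result = processor.extract_text("report.pdf")  # placeholder path, replace with your own file
if result['success']:
    print(f"Pages: {result['total_pages']}")
    print(result['full_text'][:200])  # preview the first 200 characters
else:
    print(f"Extraction failed: {result['error']}")
print(processor.extract_metadata("report.pdf"))  # title, author, creation date, page count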
from typing import Dict, List, Optional
import numpy as np
from sentence_transformers import SentenceTransformer
import json
from datetime import datetime
class SimpleVectorStore:
    """A simple in-memory vector store built on sentence-transformers embeddings."""
def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
self.model = SentenceTransformer(model_name)
self.documents = []
self.vectors = []
self.metadata = []
    def add_documents(self, texts: List[str], metadata: Optional[List[Dict]] = None):
        """Add documents to the vector store."""
if metadata is None:
metadata = [{}] * len(texts)
        # Generate embedding vectors for all texts in one batch
        embeddings = self.model.encode(texts)
        for text, embedding, meta in zip(texts, embeddings, metadata):
self.documents.append(text)
self.vectors.append(embedding)
self.metadata.append({
**meta,
'id': len(self.documents) - 1,
'added_at': datetime.now().isoformat(),
'length': len(text)
})
    def search(self, query: str, top_k: int = 5) -> List[Dict]:
        """Semantic search over the stored documents."""
if not self.vectors:
return []
        # Encode the query into a vector
query_vector = self.model.encode([query])[0]
        # Compute cosine similarity against every stored vector
similarities = []
for i, doc_vector in enumerate(self.vectors):
similarity = np.dot(query_vector, doc_vector) / (
np.linalg.norm(query_vector) * np.linalg.norm(doc_vector)
)
similarities.append((i, similarity))
        # Sort by similarity and return the top results
similarities.sort(key=lambda x: x[1], reverse=True)
results = []
for i, similarity in similarities[:top_k]:
results.append({
'content': self.documents[i],
'metadata': self.metadata[i],
'similarity': float(similarity)
})
return results
    def get_stats(self) -> Dict:
        """Return knowledge-base statistics."""
return {
'total_documents': len(self.documents),
'total_characters': sum(len(doc) for doc in self.documents),
'avg_document_length': np.mean([len(doc) for doc in self.documents]) if self.documents else 0
}
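A quick sanity check of the vector store, independent of any PDF. The two sentences and the query below are only illustrative; note that the first run downloads the all-MiniLM-L6-v2 model:
from knowledge_base.vector_store import SimpleVectorStore

store = SimpleVectorStore()
store.add_documents(
    ["LangGraph lets you build stateful AI workflows.",
     "Semantic search compares meaning rather than keywords."],
    metadata=[{"source": "notes"}, {"source": "notes"}]
)
# The query shares no keywords with the stored text but is close in meaning
for hit in store.search("find documents by meaning", top_k=1):
    print(f"{hit['similarity']:.3f} -> {hit['content']}")
print(store.get_stats())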
from langgraph.graph import StateGraph, END
from typing import TypedDict, List, Dict, Literal
from document_processor.pdf_processor import PDFProcessor
from knowledge_base.vector_store import SimpleVectorStore
import google.generativeai as genai
import os
genai.configure(api_key=os.getenv('GEMINI_API_KEY'))
model = genai.GenerativeModel('gemini-2.5-flash')
class DocumentWorkflowState(TypedDict):
file_path: str
query: str
processing_mode: str # extract, search, analyze
extracted_content: Dict
search_results: List[Dict]
final_response: str
confidence: float
# Global knowledge base and PDF processor shared by all workflow nodes
knowledge_base = SimpleVectorStore()
pdf_processor = PDFProcessor()
def process_document(state: DocumentWorkflowState) -> DocumentWorkflowState:
    """Document-processing node: extract a PDF and add it to the knowledge base."""
file_path = state["file_path"]
if file_path.lower().endswith('.pdf'):
result = pdf_processor.extract_text(file_path)
if result['success']:
            # Chunk the content and add it to the knowledge base
chunks = chunk_text(result['full_text'])
metadata = [{'source': file_path, 'chunk_id': i} for i in range(len(chunks))]
knowledge_base.add_documents(chunks, metadata)
            response = f"✅ PDF processed successfully\n📄 Total pages: {result['total_pages']}\n📝 Extracted {len(chunks)} text chunks"
confidence = 0.9
else:
            response = f"❌ Document processing failed: {result['error']}"
confidence = 0.1
result = {}
else:
        response = "❌ Only PDF files are supported for now"
confidence = 0.1
result = {}
return {
**state,
"extracted_content": result,
"final_response": response,
"confidence": confidence
}
def search_knowledge(state: DocumentWorkflowState) -> DocumentWorkflowState:
    """Knowledge-base search node."""
query = state["query"]
    # Run a semantic search over the knowledge base
search_results = knowledge_base.search(query, top_k=3)
if search_results:
        # Format the search results
        context = "\n\n".join([
            f"Relevant passage {i+1} (similarity: {result['similarity']:.3f}):\n{result['content'][:500]}..."
for i, result in enumerate(search_results)
])
        response = f"🔍 **Search Results**\n\n{context}"
confidence = max([result['similarity'] for result in search_results])
else:
        response = "🤔 Sorry, no relevant content was found. Please upload the relevant documents first."
confidence = 0.0
return {
**state,
"search_results": search_results,
"final_response": response,
"confidence": confidence
}
def analyze_with_context(state: DocumentWorkflowState) -> DocumentWorkflowState:
    """Analysis node: answer the query with Gemini, grounded in retrieved context."""
query = state["query"]
search_results = knowledge_base.search(query, top_k=5)
if search_results:
        # Truncate the context so the prompt stays within the model's token limit
        context = "\n".join([result['content'] for result in search_results])[:3000]
        prompt = f"""
Answer the question based on the following document content.
Question: {query}
Relevant content:
{context}
Please provide an accurate, detailed answer and indicate the source of the information.
"""
try:
response = model.generate_content(prompt)
            final_response = f"📖 **Analysis Based on Document Content**\n\n{response.text}"
confidence = 0.8
except Exception as e:
            final_response = f"❌ Analysis failed: {str(e)}"
confidence = 0.2
else:
        final_response = "📚 Please upload the relevant documents first to get a more accurate analysis."
confidence = 0.1
return {
**state,
"final_response": final_response,
"confidence": confidence
}
def route_processing(state: DocumentWorkflowState) -> Literal["extract", "search", "analyze"]:
    """Route to the node that matches the requested processing mode."""
return state["processing_mode"]
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> List[str]:
    """Split text into overlapping chunks."""
chunks = []
start = 0
while start < len(text):
end = start + chunk_size
chunk = text[start:end]
        # Try to end the chunk at a sentence boundary (full-width period)
        if end < len(text):
            last_sentence = chunk.rfind('。')
            # rfind returns an index relative to the chunk, so compare it
            # against the chunk size rather than the absolute position
            if last_sentence > chunk_size // 2:
end = start + last_sentence + 1
chunk = text[start:end]
chunks.append(chunk.strip())
        # Stop after the final chunk; otherwise step back by the overlap so
        # neighbouring chunks share some context
        if end >= len(text):
            break
        start = end - overlap
return [chunk for chunk in chunks if len(chunk.strip()) > 20]
def create_document_workflow():
    """Build and compile the document-processing workflow."""
workflow = StateGraph(DocumentWorkflowState)
workflow.add_node("extract", process_document)
workflow.add_node("search", search_knowledge)
workflow.add_node("analyze", analyze_with_context)
    # Every request enters through the extract node; route_processing then
    # decides whether to stop there or continue to search/analyze
    workflow.set_entry_point("extract")
workflow.add_conditional_edges(
"extract",
route_processing,
{
"extract": END,
"search": "search",
"analyze": "analyze"
}
)
workflow.add_edge("search", END)
workflow.add_edge("analyze", END)
return workflow.compile()
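Before adding the full CLI, the compiled graph can be exercised directly. A minimal sketch; report.pdf and the query are placeholders, and since the knowledge base lives only in memory, ingestion and search must happen in the same process:
from workflows.document_workflow import create_document_workflow

app = create_document_workflow()

# 1. Ingest a PDF (placeholder path) into the in-memory knowledge base
app.invoke({
    "file_path": "report.pdf", "query": "", "processing_mode": "extract",
    "extracted_content": {}, "search_results": [], "final_response": "", "confidence": 0.0
})

# 2. Search it from the same process
result = app.invoke({
    "file_path": "", "query": "main conclusions", "processing_mode": "search",
    "extracted_content": {}, "search_results": [], "final_response": "", "confidence": 0.0
})
print(result["final_response"])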
from workflows.document_workflow import create_document_workflow, knowledge_base
import os
def main():
    """Document knowledge-base main program."""
    print("📚 Intelligent Document Knowledge Base")
    print("🔧 Features: document processing, semantic search, intelligent Q&A")
print("=" * 50)
app = create_document_workflow()
while True:
try:
            print("\nChoose an action:")
            print("1. Upload and process a document")
            print("2. Search the knowledge base")
            print("3. Intelligent Q&A")
            print("4. View statistics")
            print("5. Exit")
            choice = input("\nYour choice (1-5): ").strip()
if choice == '1':
                file_path = input("Enter the file path: ").strip()
if os.path.exists(file_path):
result = app.invoke({
"file_path": file_path,
"query": "",
"processing_mode": "extract",
"extracted_content": {},
"search_results": [],
"final_response": "",
"confidence": 0.0
})
print(f"\n{result['final_response']}")
else:
                    print("❌ File does not exist")
elif choice == '2':
                query = input("Enter search keywords: ").strip()
if query:
result = app.invoke({
"file_path": "",
"query": query,
"processing_mode": "search",
"extracted_content": {},
"search_results": [],
"final_response": "",
"confidence": 0.0
})
print(f"\n{result['final_response']}")
elif choice == '3':
                query = input("Enter your question: ").strip()
if query:
result = app.invoke({
"file_path": "",
"query": query,
"processing_mode": "analyze",
"extracted_content": {},
"search_results": [],
"final_response": "",
"confidence": 0.0
})
print(f"\n{result['final_response']}")
elif choice == '4':
stats = knowledge_base.get_stats()
                print("\n📊 Knowledge base statistics:")
                print(f"📄 Document chunks: {stats['total_documents']}")
                print(f"📝 Total characters: {stats['total_characters']:,}")
                print(f"📏 Average length: {stats['avg_document_length']:.1f}")
elif choice == '5':
                print("👋 Goodbye!")
break
except KeyboardInterrupt:
            print("\n👋 Goodbye!")
break
except Exception as e:
            print(f"❌ An error occurred: {e}")
if __name__ == "__main__":
main()
✅ Multi-format support: a project structure for PDF, Word, and plain-text processing (the PDF processor is implemented in this post)
✅ Semantic search: intelligent, meaning-based search rather than keyword matching
✅ Knowledge integration: unified management of multiple documents
✅ Intelligent Q&A: context-aware analysis powered by Gemini
✅ LangGraph workflow: a clear document-processing pipeline
With today's work, the AI assistant can now process and understand documents and build an intelligent knowledge base for individuals or organizations. Tomorrow we will explore image recognition and multimodal processing, taking the assistant's perception to the next level!